by Alberto Diaz-Durana
adiazdurana@gmail.com
Date: 08.02.2021
import pandas as pd
from IPython.display import Markdown, display
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import emoji # This package is not necessary but it's fun!
import os
from os import walk
# Local folders to store files generated in this notebook
## run these two lines only if the directories are not in the same folder in which this notebook is running
#os.makedirs('_images')
#os.makedirs('_data_gen')
# Kick-off message; emoji.emojize turns the :thumbs_up: alias into the glyph.
kickoff_message = emoji.emojize("Let's do this!!! :thumbs_up:")
print(kickoff_message)
Let's do this!!! 👍
# Inspect the working directory: grab the first (top-level) tuple that
# os.walk yields, i.e. the immediate subdirectories and files.
cwd = os.getcwd()
dirpath, dirnames, filenames = next(walk(cwd))
f = list(filenames)
print(emoji.emojize(f"The current directory 📒 contains the subdirectories{dirnames} and the files :page_with_curl:: {filenames} "))
The current directory 📒 contains the subdirectories['.ipynb_checkpoints', '_data_gen', '_images'] and the files 📃: ['ADD-Preprocess event data for analysis.ipynb', 'Example MIX TYPES.txt', 'packagesADD.yml', 'Process event data for analysis.pdf', 'README.md']
First we will import the file Example MIX TYPES.txt and have a look at what we have.
# Load the raw event log; the export is tab-separated despite the .txt extension.
fn = 'Example MIX TYPES.txt'
data = pd.read_csv(fn, sep='\t')
data.head(10)
| TransID | ActivityName | Time | eLetter_Type | eLetter_ID | ShapeID | ShapeNumber | OK_After_Review_14_perc | OK_After_Review_17_perc | OK_After_Review_25_perc | ... | Paste_In_Template_STD | Preempt | Priority | Pull_Target_List_MAX | Pull_Target_List_MIN | Review_13_AVG | Review_13_STD | Review_16_AVG | Review_16_STD | Send_On | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Request | 0.000000 | Type_1 | 1 | 767 | 1 | 100 | 100 | 100 | ... | 10 | False | 0 | 2 | 1 | 30 | 15 | 30 | 15 | 10 |
| 1 | 1 | Request | 600.000000 | Type_1 | 1 | 767 | 1 | 100 | 100 | 100 | ... | 10 | False | 0 | 2 | 1 | 30 | 15 | 30 | 15 | 10 |
| 2 | 1 | Start | 600.000000 | Type_1 | 1 | 1376 | 2 | 100 | 100 | 100 | ... | 10 | False | 0 | 2 | 1 | 30 | 15 | 30 | 15 | 10 |
| 3 | 1 | Start | 600.000000 | Type_1 | 1 | 1376 | 2 | 100 | 100 | 100 | ... | 10 | False | 0 | 2 | 1 | 30 | 15 | 30 | 15 | 10 |
| 4 | 1 | Schedule Activities | 600.000000 | Type_1 | 1 | 15 | 3 | 100 | 100 | 100 | ... | 10 | False | 0 | 2 | 1 | 30 | 15 | 30 | 15 | 10 |
| 5 | 1 | Schedule Activities | 2062.294853 | Type_1 | 1 | 15 | 3 | 100 | 100 | 100 | ... | 10 | False | 0 | 2 | 1 | 30 | 15 | 30 | 15 | 10 |
| 6 | 1 | Broadcast Request for Related Articles | 2062.294853 | Type_1 | 1 | 194 | 5 | 100 | 100 | 100 | ... | 10 | False | 0 | 2 | 1 | 30 | 15 | 30 | 15 | 10 |
| 7 | 1 | Broadcast Request for Related Articles | 2962.294853 | Type_1 | 1 | 194 | 5 | 100 | 100 | 100 | ... | 10 | False | 0 | 2 | 1 | 30 | 15 | 30 | 15 | 10 |
| 8 | 1 | Delay | 2962.294853 | Type_1 | 1 | 1022 | 6 | 100 | 100 | 100 | ... | 10 | False | 0 | 2 | 1 | 30 | 15 | 30 | 15 | 10 |
| 9 | 2 | Request | 7200.000000 | Type_2 | 2 | 767 | 1 | 80 | 85 | 25 | ... | 10 | False | 0 | 2 | 1 | 30 | 15 | 30 | 15 | 10 |
10 rows × 37 columns
data.to_csv('_data_gen/_data.csv', sep='\t') #to visualize the complete table in Excel and to play around with the values
# Summary of column dtypes and non-null counts for the raw log.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3267 entries, 0 to 3266 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TransID 3267 non-null int64 1 ActivityName 3267 non-null object 2 Time 3267 non-null float64 3 eLetter_Type 3267 non-null object 4 eLetter_ID 3267 non-null int64 5 ShapeID 3267 non-null int64 6 ShapeNumber 3267 non-null int64 7 OK_After_Review_14_perc 3267 non-null int64 8 OK_After_Review_17_perc 3267 non-null int64 9 OK_After_Review_25_perc 3267 non-null int64 10 ProcessName 3267 non-null object 11 Analyze_Criteria_AVG 3267 non-null int64 12 Analyze_Criteria_STD 3267 non-null int64 13 Create_Initial_Draft_MAX 3267 non-null int64 14 Create_Initial_Draft_MIN 3267 non-null int64 15 Create_Proof_AVG 3267 non-null int64 16 Create_Proof_STD 3267 non-null int64 17 Delta_Time 3267 non-null int64 18 Gather_Materials 3267 non-null int64 19 Internal_Review_MAX 3267 non-null int64 20 Internal_Review_MIN 3267 non-null int64 21 Modify_Criteria_AVG 3267 non-null int64 22 Modify_Criteria_STD 3267 non-null int64 23 Optimize_Graphics_MAX 3267 non-null int64 24 Optimize_Graphics_MIN 3267 non-null int64 25 Outline_Text 3267 non-null int64 26 Paste_In_Template_AVG 3267 non-null int64 27 Paste_In_Template_STD 3267 non-null int64 28 Preempt 3267 non-null bool 29 Priority 3267 non-null int64 30 Pull_Target_List_MAX 3267 non-null int64 31 Pull_Target_List_MIN 3267 non-null int64 32 Review_13_AVG 3267 non-null int64 33 Review_13_STD 3267 non-null int64 34 Review_16_AVG 3267 non-null int64 35 Review_16_STD 3267 non-null int64 36 Send_On 3267 non-null int64 dtypes: bool(1), float64(1), int64(32), object(3) memory usage: 922.2+ KB
# Report the dimensions of the imported event log.
print(f'{fn} has {data.shape[0]} rows and {data.shape[1]} columns.')
Example MIX TYPES.txt has 3267 rows and 37 columns.
The eLetter_ID is the case identifier and the ActivityName is the activity.
According to the description given in the assignment, each event has to have at least a case identifier (TransID in the data), a name of the activity that has been executed (ActivityName), a start and a complete timestamp (which will be calculated later on in this notebook). Optionally, an event can have an arbitrary number of attributes (variables or columns in the DataFrame).
# Count the events: every row of the event log is one event.
numberOfEvents = len(data)
# Fixed the doubled word ("The the") in the printed message.
print(emoji.emojize(f"The Event Log :page_with_curl: has a total number of {numberOfEvents} events (rows)."))
The the Event Log 📃 has total number of 3267 events (rows).
To obtain the number of events for each activity name we have to count the number of observations of case identifiers for each ActivityName.
# Events per activity: count case-identifier observations for each ActivityName.
data.groupby('ActivityName')['TransID'].count()
ActivityName Create Initial Draft 125 Analyze Criteria 108 Broadcast Request for Related Articles 164 Change 4 Create Initial Target List 118 Create Proof 117 Delay 152 End 102 Gather Materials 140 Internal Review And Changes 137 Modify Criteria 22 OK? 352 Optimize Graphics 136 Outline Text, Targeting, and URLs 190 Paste In Template 134 Pull Target List 106 Request 176 Review 247 Schedule Activities 176 Send Email 161 Send On 114 Send To Production 110 Start 176 Name: TransID, dtype: int64
The number of cases is the amount of unique case identifiers. Notice also that the case identifiers go from 1 to 151. So, an alternative solution is to find the highest number in this column.
# Number of cases = number of unique case identifiers (TransID).
print(emoji.emojize(f"The number of cases is {data['TransID'].nunique()} :books:"))
The number of cases is 151 📚
# Distinct eLetter types present in the log.
print(emoji.emojize(f"The values in the eLetter_Type column are {data['eLetter_Type'].unique().tolist()} :ledger:"))
The values in the eLetter_Type column are ['Type_1', 'Type_2', 'Type_4'] 📒
# The number of occurences (how often?) of each eLetter_Type attribute are:
# (the double brackets around 'TransID' keep the result a DataFrame, not a Series)
data.groupby(['eLetter_Type'])[['TransID']].count()
| TransID | |
|---|---|
| eLetter_Type | |
| Type_1 | 1598 |
| Type_2 | 901 |
| Type_4 | 768 |
Our client provided the fix point time indicating the time stamp at which all begins (the genesis 😄). It’s the 01.01.2016 10:00:00
# Fix point provided by the client: all 'Time' offsets are relative to 2016-01-01 10:00:00.
fixPointTime = pd.Timestamp(2016, 1, 1, 10)
fixPointTime
Timestamp('2016-01-01 10:00:00')
The variable 'timeStamp' is obtained by adding the Time from the attribute 'Time' to the variable fixPointTime
# Absolute timestamp = fix point + elapsed seconds stored in 'Time'.
data['timeStamp'] = fixPointTime + pd.to_timedelta(data['Time'], unit='s')
data['timeStamp']
0 2016-01-01 10:00:00.000000
1 2016-01-01 10:10:00.000000
2 2016-01-01 10:10:00.000000
3 2016-01-01 10:10:00.000000
4 2016-01-01 10:10:00.000000
...
3262 2016-01-31 00:52:24.198348
3263 2016-01-31 00:54:36.096105
3264 2016-01-31 00:54:36.096105
3265 2016-01-31 01:46:48.605888
3266 2016-01-31 01:46:48.605888
Name: timeStamp, Length: 3267, dtype: datetime64[ns]
#data.dtypes
A first step in preparing the event log is to calculate the relative time of each event, which is the time at which the event occurs with respect to the beginning of the process.
# Create a pivot table of the start (minimum) and end (maximum) timestamps associated with each case:
case_starts_ends = data.pivot_table(index=['TransID','ActivityName'], aggfunc={'timeStamp': ['min', 'max']})
case_starts_ends = case_starts_ends.reset_index()
# Positional rename: after reset_index the aggregate columns appear as
# ('timeStamp','max') then ('timeStamp','min') -- alphabetical by aggregation
# name -- so 'caseEnd' (max) precedes 'caseStart' (min). NOTE(review): this
# relies on that ordering; verify if the aggfunc dict is ever changed.
case_starts_ends.columns = ['TransID','ActivityName', 'caseEnd', 'caseStart']
# Merge with the main event log data so that for each row we have the start and end times.
data = data.merge(case_starts_ends, on=['TransID','ActivityName'])
Now we inspect the DataFrame to check on the new start and end timestamps.
# Inspect the merged start/end timestamps next to the raw Time column.
data[['TransID','ActivityName', 'timeStamp','Time','caseEnd', 'caseStart']].head(20)
| TransID | ActivityName | timeStamp | Time | caseEnd | caseStart | |
|---|---|---|---|---|---|---|
| 0 | 1 | Request | 2016-01-01 10:00:00.000000 | 0.000000 | 2016-01-01 10:10:00.000000 | 2016-01-01 10:00:00.000000 |
| 1 | 1 | Request | 2016-01-01 10:10:00.000000 | 600.000000 | 2016-01-01 10:10:00.000000 | 2016-01-01 10:00:00.000000 |
| 2 | 1 | Start | 2016-01-01 10:10:00.000000 | 600.000000 | 2016-01-01 10:10:00.000000 | 2016-01-01 10:10:00.000000 |
| 3 | 1 | Start | 2016-01-01 10:10:00.000000 | 600.000000 | 2016-01-01 10:10:00.000000 | 2016-01-01 10:10:00.000000 |
| 4 | 1 | Schedule Activities | 2016-01-01 10:10:00.000000 | 600.000000 | 2016-01-01 10:34:22.294853 | 2016-01-01 10:10:00.000000 |
| 5 | 1 | Schedule Activities | 2016-01-01 10:34:22.294853 | 2062.294853 | 2016-01-01 10:34:22.294853 | 2016-01-01 10:10:00.000000 |
| 6 | 1 | Broadcast Request for Related Articles | 2016-01-01 10:34:22.294853 | 2062.294853 | 2016-01-01 10:49:22.294853 | 2016-01-01 10:34:22.294853 |
| 7 | 1 | Broadcast Request for Related Articles | 2016-01-01 10:49:22.294853 | 2962.294853 | 2016-01-01 10:49:22.294853 | 2016-01-01 10:34:22.294853 |
| 8 | 1 | Delay | 2016-01-01 10:49:22.294853 | 2962.294853 | 2016-01-01 18:39:23.936862 | 2016-01-01 10:49:22.294853 |
| 9 | 1 | Delay | 2016-01-01 18:39:23.936862 | 31163.936862 | 2016-01-01 18:39:23.936862 | 2016-01-01 10:49:22.294853 |
| 10 | 2 | Request | 2016-01-01 12:00:00.000000 | 7200.000000 | 2016-01-01 12:10:00.000000 | 2016-01-01 12:00:00.000000 |
| 11 | 2 | Request | 2016-01-01 12:10:00.000000 | 7800.000000 | 2016-01-01 12:10:00.000000 | 2016-01-01 12:00:00.000000 |
| 12 | 2 | Start | 2016-01-01 12:10:00.000000 | 7800.000000 | 2016-01-01 12:10:00.000000 | 2016-01-01 12:10:00.000000 |
| 13 | 2 | Start | 2016-01-01 12:10:00.000000 | 7800.000000 | 2016-01-01 12:10:00.000000 | 2016-01-01 12:10:00.000000 |
| 14 | 2 | Schedule Activities | 2016-01-01 12:10:00.000000 | 7800.000000 | 2016-01-01 12:26:22.455046 | 2016-01-01 12:10:00.000000 |
| 15 | 2 | Schedule Activities | 2016-01-01 12:26:22.455046 | 8782.455046 | 2016-01-01 12:26:22.455046 | 2016-01-01 12:10:00.000000 |
| 16 | 2 | Broadcast Request for Related Articles | 2016-01-01 12:26:22.455046 | 8782.455046 | 2016-01-01 12:41:22.455046 | 2016-01-01 12:26:22.455046 |
| 17 | 2 | Broadcast Request for Related Articles | 2016-01-01 12:41:22.455046 | 9682.455046 | 2016-01-01 12:41:22.455046 | 2016-01-01 12:26:22.455046 |
| 18 | 2 | Delay | 2016-01-01 12:41:22.455046 | 9682.455046 | 2016-01-01 19:48:41.041293 | 2016-01-01 12:41:22.455046 |
| 19 | 2 | Delay | 2016-01-01 19:48:41.041293 | 35321.041293 | 2016-01-01 19:48:41.041293 | 2016-01-01 12:41:22.455046 |
Notice that the values in ActivityName are still doubled for each value in TransID. The next step is to eliminate the duplicates, leaving the rows used later to compute the relative time relativeTime_s.
# Keep only the last (completion) row of each (case, activity) pair.
dataT = data.drop_duplicates(subset = ['TransID','ActivityName'], keep='last').reset_index(drop=True)
dataT[['TransID','ActivityName', 'caseStart','caseEnd']].head(10)
| TransID | ActivityName | caseStart | caseEnd | |
|---|---|---|---|---|
| 0 | 1 | Request | 2016-01-01 10:00:00.000000 | 2016-01-01 10:10:00.000000 |
| 1 | 1 | Start | 2016-01-01 10:10:00.000000 | 2016-01-01 10:10:00.000000 |
| 2 | 1 | Schedule Activities | 2016-01-01 10:10:00.000000 | 2016-01-01 10:34:22.294853 |
| 3 | 1 | Broadcast Request for Related Articles | 2016-01-01 10:34:22.294853 | 2016-01-01 10:49:22.294853 |
| 4 | 1 | Delay | 2016-01-01 10:49:22.294853 | 2016-01-01 18:39:23.936862 |
| 5 | 2 | Request | 2016-01-01 12:00:00.000000 | 2016-01-01 12:10:00.000000 |
| 6 | 2 | Start | 2016-01-01 12:10:00.000000 | 2016-01-01 12:10:00.000000 |
| 7 | 2 | Schedule Activities | 2016-01-01 12:10:00.000000 | 2016-01-01 12:26:22.455046 |
| 8 | 2 | Broadcast Request for Related Articles | 2016-01-01 12:26:22.455046 | 2016-01-01 12:41:22.455046 |
| 9 | 2 | Delay | 2016-01-01 12:41:22.455046 | 2016-01-01 19:48:41.041293 |
# Structure of the transformed (deduplicated) event log.
dataT.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1565 entries, 0 to 1564 Data columns (total 40 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TransID 1565 non-null int64 1 ActivityName 1565 non-null object 2 Time 1565 non-null float64 3 eLetter_Type 1565 non-null object 4 eLetter_ID 1565 non-null int64 5 ShapeID 1565 non-null int64 6 ShapeNumber 1565 non-null int64 7 OK_After_Review_14_perc 1565 non-null int64 8 OK_After_Review_17_perc 1565 non-null int64 9 OK_After_Review_25_perc 1565 non-null int64 10 ProcessName 1565 non-null object 11 Analyze_Criteria_AVG 1565 non-null int64 12 Analyze_Criteria_STD 1565 non-null int64 13 Create_Initial_Draft_MAX 1565 non-null int64 14 Create_Initial_Draft_MIN 1565 non-null int64 15 Create_Proof_AVG 1565 non-null int64 16 Create_Proof_STD 1565 non-null int64 17 Delta_Time 1565 non-null int64 18 Gather_Materials 1565 non-null int64 19 Internal_Review_MAX 1565 non-null int64 20 Internal_Review_MIN 1565 non-null int64 21 Modify_Criteria_AVG 1565 non-null int64 22 Modify_Criteria_STD 1565 non-null int64 23 Optimize_Graphics_MAX 1565 non-null int64 24 Optimize_Graphics_MIN 1565 non-null int64 25 Outline_Text 1565 non-null int64 26 Paste_In_Template_AVG 1565 non-null int64 27 Paste_In_Template_STD 1565 non-null int64 28 Preempt 1565 non-null bool 29 Priority 1565 non-null int64 30 Pull_Target_List_MAX 1565 non-null int64 31 Pull_Target_List_MIN 1565 non-null int64 32 Review_13_AVG 1565 non-null int64 33 Review_13_STD 1565 non-null int64 34 Review_16_AVG 1565 non-null int64 35 Review_16_STD 1565 non-null int64 36 Send_On 1565 non-null int64 37 timeStamp 1565 non-null datetime64[ns] 38 caseEnd 1565 non-null datetime64[ns] 39 caseStart 1565 non-null datetime64[ns] dtypes: bool(1), datetime64[ns](3), float64(1), int64(32), object(3) memory usage: 478.5+ KB
# Count the events after deduplication: one row per event.
numberOfEventsT = len(dataT)
# Fixed the doubled word ("The the") in the printed message.
print(emoji.emojize(f"The Event Log :page_with_curl: has now a total number of {numberOfEventsT} events (rows) after the transformation."))
print(emoji.emojize(f"We can compare to a total number of {numberOfEvents} events (rows) it had before the transformation."))
The the Event Log 📃 has now total number of 1565 events (rows) after the transformation. We can compare to a total number of 3267 events (rows) it had before the transformation.
We count the number of observations of case identifiers for each ActivityName after the transformation.
# Events per activity after removing the duplicate rows.
dataT.groupby('ActivityName')['TransID'].count()
ActivityName Create Initial Draft 63 Analyze Criteria 55 Broadcast Request for Related Articles 88 Change 2 Create Initial Target List 63 Create Proof 59 Delay 76 End 51 Gather Materials 76 Internal Review And Changes 62 Modify Criteria 11 OK? 112 Optimize Graphics 61 Outline Text, Targeting, and URLs 127 Paste In Template 61 Pull Target List 53 Request 88 Review 59 Schedule Activities 88 Send Email 110 Send On 57 Send To Production 55 Start 88 Name: TransID, dtype: int64
# Sanity check: deduplication must not change the number of distinct cases.
print(emoji.emojize(f"The number of cases for the transformed data is {dataT['TransID'].nunique()} :books:"))
The number of cases for the transformed data is 151 📚
# Sanity check: deduplication keeps the same set of eLetter types.
print(emoji.emojize(f"The values in the eLetter_Type column for the transformed data are {dataT['eLetter_Type'].unique().tolist()} :ledger:"))
The values in the eLetter_Type column for the transformed data are ['Type_1', 'Type_2', 'Type_4'] 📒
# The number of occurences (how often?) of each eLetter_Type attribute are:
# (roughly halved versus the raw log, since there is now one row per event)
dataT.groupby(['eLetter_Type'])[['TransID']].count()
| TransID | |
|---|---|
| eLetter_Type | |
| Type_1 | 781 |
| Type_2 | 406 |
| Type_4 | 378 |
# Cross-tabulate activity frequency per eLetter type, most frequent Type_1 first.
pd.crosstab(dataT['ActivityName'], dataT['eLetter_Type']).sort_values(by=['Type_1'], ascending=False)
| eLetter_Type | Type_1 | Type_2 | Type_4 |
|---|---|---|---|
| ActivityName | |||
| Outline Text, Targeting, and URLs | 63 | 32 | 32 |
| OK? | 57 | 28 | 27 |
| Send Email | 56 | 28 | 26 |
| Schedule Activities | 44 | 22 | 22 |
| Request | 44 | 22 | 22 |
| Start | 44 | 22 | 22 |
| Broadcast Request for Related Articles | 44 | 22 | 22 |
| Delay | 38 | 19 | 19 |
| Gather Materials | 38 | 19 | 19 |
| Optimize Graphics | 31 | 16 | 14 |
| Create Initial Target List | 31 | 16 | 16 |
| Paste In Template | 31 | 16 | 14 |
| Internal Review And Changes | 31 | 16 | 15 |
| Create Initial Draft | 31 | 16 | 16 |
| Create Proof | 30 | 15 | 14 |
| Review | 30 | 15 | 14 |
| Send On | 29 | 15 | 13 |
| Analyze Criteria | 28 | 14 | 13 |
| Send To Production | 28 | 14 | 13 |
| Pull Target List | 27 | 13 | 13 |
| End | 26 | 13 | 12 |
| Modify Criteria | 0 | 11 | 0 |
| Change | 0 | 2 | 0 |
# Bar chart of activities per eLetter type.
# NOTE(review): y='eLetter_Type' is a categorical column -- presumably a count
# per activity/type was intended here; confirm the rendered figure is as expected.
fig = px.bar(dataT, x='ActivityName', y='eLetter_Type')
fig.show()
We can verify that the values in caseStart and in caseEnd were obtained by adding the values in Time to the variable fixPointTime by calculating the relative time between events.
# Calculate the relative time by subtracting the process start time from the event timestamp
dataT['relativeTime'] = dataT['timeStamp'] - dataT['caseStart']
# Convert relative times to whole seconds to compare with the attribute Time.
# Timedelta.dt.total_seconds() covers days + seconds + microseconds in one
# call; truncating to int64 gives the same integer result as the manual
# `dt.seconds + 86400*dt.days` (all deltas here are non-negative).
dataT['relativeTime_s'] = dataT['relativeTime'].dt.total_seconds().astype('int64')
dataT.loc[650:670,['TransID','ActivityName', 'Time','caseEnd','caseStart','relativeTime_s']]
| TransID | ActivityName | Time | caseEnd | caseStart | relativeTime_s | |
|---|---|---|---|---|---|---|
| 650 | 56 | Delay | 9.986799e+05 | 2016-01-12 23:24:39.942867 | 2016-01-12 16:00:29.849801 | 26650 |
| 651 | 42 | Optimize Graphics | 1.580302e+06 | 2016-01-19 16:58:21.821436 | 2016-01-12 16:04:51.690925 | 608010 |
| 652 | 66 | Start | 9.726000e+05 | 2016-01-12 16:10:00.000000 | 2016-01-12 16:10:00.000000 | 0 |
| 653 | 66 | Schedule Activities | 9.736958e+05 | 2016-01-12 16:28:15.839854 | 2016-01-12 16:10:00.000000 | 1095 |
| 654 | 57 | Delay | 9.958692e+05 | 2016-01-12 22:37:49.245274 | 2016-01-12 16:15:29.849801 | 22939 |
| 655 | 66 | Broadcast Request for Related Articles | 1.297431e+06 | 2016-01-16 10:23:50.671186 | 2016-01-12 16:28:15.839854 | 323734 |
| 656 | 59 | Delay | 1.002509e+06 | 2016-01-13 00:28:28.514177 | 2016-01-12 16:30:29.849801 | 28678 |
| 657 | 58 | Send To Production | 9.746477e+05 | 2016-01-12 16:44:07.729100 | 2016-01-12 16:39:07.729100 | 300 |
| 658 | 40 | Send On | 9.750153e+05 | 2016-01-12 16:50:15.251711 | 2016-01-12 16:40:15.251711 | 600 |
| 659 | 58 | Analyze Criteria | 1.215643e+06 | 2016-01-15 11:40:42.853685 | 2016-01-12 16:44:07.729100 | 240995 |
| 660 | 40 | Send Email | 1.296650e+06 | 2016-01-16 10:10:49.937815 | 2016-01-12 16:50:15.251711 | 321634 |
| 661 | 60 | Send To Production | 9.760083e+05 | 2016-01-12 17:06:48.322180 | 2016-01-12 17:01:48.322180 | 300 |
| 662 | 55 | Send Email | 1.384622e+06 | 2016-01-17 10:37:01.986196 | 2016-01-12 17:06:08.561697 | 408653 |
| 663 | 60 | Analyze Criteria | 1.221098e+06 | 2016-01-15 13:11:37.756386 | 2016-01-12 17:06:48.322180 | 245089 |
| 664 | 41 | Paste In Template | 9.799451e+05 | 2016-01-12 18:12:25.131146 | 2016-01-12 17:15:13.404552 | 3431 |
| 665 | 62 | Delay | 1.002550e+06 | 2016-01-13 00:29:10.050702 | 2016-01-12 17:21:48.322180 | 25641 |
| 666 | 47 | Outline Text, Targeting, and URLs | 9.784083e+05 | 2016-01-12 17:46:48.322180 | 2016-01-12 17:31:48.322180 | 900 |
| 667 | 42 | Paste In Template | 1.583900e+06 | 2016-01-19 17:58:20.213844 | 2016-01-12 17:34:21.225642 | 606238 |
| 668 | 61 | Send To Production | 9.779868e+05 | 2016-01-12 17:39:46.815648 | 2016-01-12 17:34:46.815648 | 300 |
| 669 | 61 | Analyze Criteria | 1.222459e+06 | 2016-01-15 13:34:19.375406 | 2016-01-12 17:39:46.815648 | 244472 |
| 670 | 47 | Create Initial Draft | 1.217425e+06 | 2016-01-15 12:10:24.621152 | 2016-01-12 17:46:48.322180 | 239016 |
Notice that
dataT.to_csv('_data_gen/_dataTransformed.csv', sep='\t') #to visualize the complete table in Excel and to play around with the values
# Distribution summary of the relative times (in seconds).
dataT['relativeTime_s'].describe()
count 1.565000e+03 mean 5.167660e+04 std 1.179131e+05 min 0.000000e+00 25% 3.000000e+02 50% 2.700000e+03 75% 4.990200e+04 max 1.098863e+06 Name: relativeTime_s, dtype: float64
# Plot the relative time of every entry in sequence.
plt.figure(figsize = (16,8))
plt.xlabel("Entry number")
plt.ylabel("Time in seconds")
dataT['relativeTime_s'].plot()
<AxesSubplot:xlabel='Entry number', ylabel='Time in seconds'>
# Box plot of relative time per activity, split by eLetter type.
fig = px.box(dataT, x="ActivityName", y="relativeTime_s", color="eLetter_Type")
fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
fig.show()
Filtering events
One final thing we will want to look at is which events are shared by all processes and which are not, since in process mining it is the non-shared differentiating events that we are interested in.
## Create a table giving the number of cases in which each event is present.
dataT_events = pd.crosstab(dataT['TransID'], dataT['ActivityName']) ## Visualise in a heatmap
plt.figure(figsize = (16,8))
sns.heatmap(dataT_events, cmap="YlGnBu")
## An activity is common to all cases when every case has a strictly positive
## count for it. (The previous `nunique == 1` check was too strict: it required
## the *same* count in every case, so an activity present in all cases with
## varying counts would wrongly be reported as not shared.)
nunique = dataT_events.apply(pd.Series.nunique)  # kept for inspection
presence = dataT_events > 0
sharedActivities = presence.columns[presence.all()]
activitiesToKeep = presence.columns[~presence.all()]
print('The following activities are common to all cases: {}'.format(', '.join(sharedActivities)))
print('The following activities are not common to all cases: {}'.format(', '.join(activitiesToKeep)))
The following activities are common to all cases: The following activities are not common to all cases: Create Initial Draft, Analyze Criteria, Broadcast Request for Related Articles, Change, Create Initial Target List, Create Proof, Delay, End, Gather Materials, Internal Review And Changes, Modify Criteria, OK?, Optimize Graphics, Outline Text, Targeting, and URLs, Paste In Template, Pull Target List, Request, Review, Schedule Activities, Send Email, Send On, Send To Production, Start
print(emoji.emojize(f"All activities are not common to all cases :exploding_head:"))
All activities are not common to all cases 🤯
The selected package is twint from https://github.com/twintproject/twint to scrape and to store the tweets in a .json file for each day the process has been running.
To install and be able to run the package in this notebook we will clone the repository as explained in the url above.
# Third-party scraper (https://github.com/twintproject/twint).
# nest_asyncio lets twint's asyncio event loop run inside the notebook's own loop.
import twint
import nest_asyncio
from datetime import datetime
nest_asyncio.apply()
print(twint.__version__)
2.1.20
Uncomment the cell in this subchapter if you would like to do the entire process of scraping the tweets and placing them in a folder called "penguin". The tweets scraped in a day will be stored in a json file with the date as filename.
The cell below creates several functions to automate the process of searching over several days and storing each day’s results as a distinct json file: twint_loop splits the date range into a series of days and calls twint_search to do the searching for each date. Each json is named after the date and stored in a directory based on the search term, using clean_name to ensure that it is a valid directory name.
# Credits to Neal Caren for the functions!
# Scraping has to be done only once!
# All files are available in the folder "penguin"
from datetime import timedelta
from string import ascii_letters, digits
from os import mkdir, path
# NOTE: this cell is deliberately disabled -- the scraping helpers are kept
# inside a triple-quoted string so they are not executed (scraping only has
# to run once; the results are already stored in the "penguin" folder).
'''
def clean_name(dirname):
valid = set(ascii_letters + digits)
return ''.join(a for a in dirname if a in valid)
def twint_search(searchterm, since, until, json_name):
#Twint search for a specific date range.
#Stores results to json.
c = twint.Config()
c.Search = searchterm
c.Since = since
c.Until = until
c.Hide_output = True
c.Store_json = True
c.Output = json_name
c.Debug = True
try:
twint.run.Search(c)
except (KeyboardInterrupt, SystemExit):
raise
except:
print("Problem with %s." % since)
def twint_loop(searchterm, since, until):
dirname = clean_name(searchterm)
try:
# Create target Directory
mkdir(dirname)
print("Directory" , dirname , "Created ")
except FileExistsError:
print("Directory" , dirname , "already exists")
daterange = pd.date_range(since, until)
for start_date in daterange:
since= start_date.strftime("%Y-%m-%d")
until = (start_date + timedelta(days=1)).strftime("%Y-%m-%d")
json_name = '%s.json' % since
json_name = path.join(dirname, json_name)
print('Getting %s ' % since )
twint_search(searchterm, since, until, json_name)
'''
'\ndef clean_name(dirname):\n valid = set(ascii_letters + digits)\n return \'\'.join(a for a in dirname if a in valid)\n\n\ndef twint_search(searchterm, since, until, json_name):\n \n\n\n #Twint search for a specific date range.\n #Stores results to json.\n\n \n c = twint.Config()\n c.Search = searchterm\n c.Since = since\n c.Until = until\n c.Hide_output = True\n c.Store_json = True\n c.Output = json_name\n c.Debug = True\n\n try:\n twint.run.Search(c) \n except (KeyboardInterrupt, SystemExit):\n raise\n except:\n print("Problem with %s." % since)\n\n\n\n\ndef twint_loop(searchterm, since, until):\n\n dirname = clean_name(searchterm)\n try:\n # Create target Directory\n mkdir(dirname)\n print("Directory" , dirname , "Created ")\n except FileExistsError:\n print("Directory" , dirname , "already exists")\n\n daterange = pd.date_range(since, until)\n\n for start_date in daterange:\n\n since= start_date.strftime("%Y-%m-%d")\n until = (start_date + timedelta(days=1)).strftime("%Y-%m-%d")\n\n json_name = \'%s.json\' % since\n json_name = path.join(dirname, json_name)\n\n print(\'Getting %s \' % since )\n twint_search(searchterm, since, until, json_name)\n'
# NOTE: deliberately disabled cell -- kept as a string so the directory
# listing of the scraped json files is not executed here.
'''
#The list the contents of the new directory confirms that it worked.
# This function ingests the all the json files found in the directory "penguin" created with the functions above.
from glob import glob
glob(path.join('penguin','*.json'))'''
'\n#The list the contents of the new directory confirms that it worked.\n# This function ingests the all the json files found in the directory "penguin" created with the functions above.\nfrom glob import glob\nglob(path.join(\'penguin\',\'*.json\'))'
# NOTE: deliberately disabled cell -- the scraping loop is kept as a string
# so it is not re-run (it would hit Twitter for every event's date window).
'''
# Now we loop over the DataFrame using the start and end date of each event.
for index, row in data.iterrows():
twint_loop('#penguin', row['caseStart'], row['caseEnd'])'''
"\n# Now we loop over the DataFrame using the start and end date of each event.\nfor index, row in data.iterrows():\n twint_loop('#penguin', row['caseStart'], row['caseEnd'])"
# NOTE: deliberately disabled cell -- combining the per-day json files is
# kept as a string; the workaround below downloads the pre-built csv instead.
'''
#Finally, the separate data files can be combined into a single dataframe.
file_names = glob(path.join('penguin','*.json'))
dfs = [pd.read_json(fn, lines = True) for fn in file_names]
penguins_df = pd.concat(dfs)
penguins_df.info()'''
"\n#Finally, the separate data files can be combined into a single dataframe.\nfile_names = glob(path.join('penguin','*.json'))\ndfs = [pd.read_json(fn, lines = True) for fn in file_names]\npenguins_df = pd.concat(dfs)\n\npenguins_df.info()"
Comment the cell in this subchapter if you would like to run the cells above and see how twitter gets scraped!
If you decide to use this workaround, a csv file will be downloaded from a url and stored into a DataFrame.
# Workaround: download the pre-scraped tweets instead of re-running twint.
penguins_df = pd.read_csv("https://www.dropbox.com/s/np78fr2kwcp19bw/penguins_df.csv?dl=1")
tweets_only = penguins_df['tweet']
from textblob import TextBlob
# Create textblob objects of the tweets
sentiment_objects = [TextBlob(tweet) for tweet in tweets_only]
# Sanity check: polarity of the first tweet (TextBlob polarity ranges -1..1).
sentiment_objects[0].polarity, sentiment_objects[0]
(0.5,
TextBlob("I just couldn't resist! #cute #penguin #lighting #victoriacentre #Nottingham https://t.co/2l2qZpUKvE"))
The next cell takes about 5 minutes... So let's go get a cup of tea ☕
# Create list of [polarity value, tweet text] pairs.
sentiment_values = [[tweet.sentiment.polarity, str(tweet)] for tweet in sentiment_objects]
sentiment_values[0]
[0.5, "I just couldn't resist! #cute #penguin #lighting #victoriacentre #Nottingham https://t.co/2l2qZpUKvE"]
# Create dataframe containing the polarity value and tweet text
# (first 20 rows shown for inspection).
sentiment_df = pd.DataFrame(sentiment_values, columns=["polarity", "tweet"])
sentiment_df.head(20)
| polarity | tweet | |
|---|---|---|
| 0 | 0.500000 | I just couldn't resist! #cute #penguin #light... |
| 1 | 0.000000 | Disco penguins #Georgetown #DC #Christmas #hap... |
| 2 | 0.000000 | Newport Aquarium today! #penguin 🐧 @ Newport A... |
| 3 | 0.500000 | #review #Penguin #Run is one of the many #andr... |
| 4 | 0.250000 | 🐧Who was the Penguins favorite relative? Aunt-... |
| 5 | 0.000000 | @robinlordtaylor miss Gotham! #Penguin♡ |
| 6 | 0.108333 | The Boys... Together. Awwww.... I wanna go to ... |
| 7 | 0.000000 | #rico #ricothepenguin #penguinsofmadagascar #g... |
| 8 | 0.000000 | #buschgardens #tampa #florida #zoo #animals #p... |
| 9 | 0.485227 | Happy New Years everyone! #art #illustration #... |
| 10 | 0.000000 | #penguin #happynewyears #2016 https://t.co/Cx... |
| 11 | 0.235606 | Light up the new year with our new #DIY #Pengu... |
| 12 | 0.000000 | My goal for today was to see penguins. Objecti... |
| 13 | 0.000000 | Hawk shoes. Penguin socks. #iowa #penguin h... |
| 14 | 0.000000 | Percy Penguin has joined the family!! Thank yo... |
| 15 | 0.400000 | A cute #penguin to start the #newyear with a #... |
| 16 | 0.000000 | Club #Penguin #August #2013 #Furniture #Catalo... |
| 17 | 0.000000 | #penguin #seaworldorlando #penguino https://t... |
| 18 | 0.000000 | Origami\rDay one\r#penguin 🐧 #origami https:/... |
| 19 | 0.000000 | #HappyNewYear2016 #FamilyMoments #FunnyFaces #... |
# Same first 20 tweets, ordered from most negative to most positive polarity.
sentiment_df.head(20).sort_values(by=['polarity'], ascending=True)
| polarity | tweet | |
|---|---|---|
| 19 | 0.000000 | #HappyNewYear2016 #FamilyMoments #FunnyFaces #... |
| 14 | 0.000000 | Percy Penguin has joined the family!! Thank yo... |
| 13 | 0.000000 | Hawk shoes. Penguin socks. #iowa #penguin h... |
| 12 | 0.000000 | My goal for today was to see penguins. Objecti... |
| 10 | 0.000000 | #penguin #happynewyears #2016 https://t.co/Cx... |
| 18 | 0.000000 | Origami\rDay one\r#penguin 🐧 #origami https:/... |
| 7 | 0.000000 | #rico #ricothepenguin #penguinsofmadagascar #g... |
| 8 | 0.000000 | #buschgardens #tampa #florida #zoo #animals #p... |
| 5 | 0.000000 | @robinlordtaylor miss Gotham! #Penguin♡ |
| 17 | 0.000000 | #penguin #seaworldorlando #penguino https://t... |
| 2 | 0.000000 | Newport Aquarium today! #penguin 🐧 @ Newport A... |
| 1 | 0.000000 | Disco penguins #Georgetown #DC #Christmas #hap... |
| 16 | 0.000000 | Club #Penguin #August #2013 #Furniture #Catalo... |
| 6 | 0.108333 | The Boys... Together. Awwww.... I wanna go to ... |
| 11 | 0.235606 | Light up the new year with our new #DIY #Pengu... |
| 4 | 0.250000 | 🐧Who was the Penguins favorite relative? Aunt-... |
| 15 | 0.400000 | A cute #penguin to start the #newyear with a #... |
| 9 | 0.485227 | Happy New Years everyone! #art #illustration #... |
| 3 | 0.500000 | #review #Penguin #Run is one of the many #andr... |
| 0 | 0.500000 | I just couldn't resist! #cute #penguin #light... |
These polarity values can be plotted in a histogram, which can help to highlight in the overall sentiment (i.e. more positivity or negativity) toward the subject.
fig, ax = plt.subplots(figsize=(8, 6))
# Plot histogram of the polarity values.
# NOTE: there is no bin edge at 0.0, so all neutral tweets fall into the
# wide (-0.25, 0.25) bin; the next plot removes the zeros instead.
sentiment_df.hist(bins=[-1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75, 1],
                  ax=ax,
                  color="purple")
plt.title("Sentiments from Tweets on penguins")
plt.show()
# Drop neutral tweets (polarity == 0) so the remaining distribution is easier to read
sentiment_df = sentiment_df[sentiment_df.polarity != 0]
fig, ax = plt.subplots(figsize=(8, 6))
# Re-plot the histogram, this time with an explicit bin edge at zero
sentiment_df.hist(
    bins=[-1, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1],
    ax=ax,
    color="purple",
)
plt.title("Sentiments from Tweets on penguins")
plt.show()
This plot displays a revised histogram of polarity values for tweets on penguins. For this histogram polarity values equal to zero have been removed to better highlight the distribution of polarity values.
Let's have a look now at the frequency of mentions per date according to the DataFrame we have just generated.
# Combine the separate date and time columns into a single datetime column.
# cache=True speeds up parsing because the same dates repeat many times.
dates = pd.to_datetime(penguins_df.date, cache=True)
times = pd.to_timedelta(penguins_df.time)
# datetime Series + timedelta Series yields a datetime Series; assign it
# directly instead of wrapping it in a throwaway one-column DataFrame.
penguins_df['datetime'] = dates + times
penguins_df['datetime']
0 2016-01-02 00:45:42
1 2016-01-02 00:29:50
2 2016-01-02 00:29:28
3 2016-01-01 23:41:15
4 2016-01-01 23:17:57
...
780563 2016-01-31 15:07:51
780564 2016-01-31 15:07:48
780565 2016-01-31 15:01:15
780566 2016-01-31 14:50:27
780567 2016-01-31 14:50:00
Name: datetime, Length: 780568, dtype: datetime64[ns]
# Sanity check: count the penguin tweets that fall inside the time window of case 10
case_row = 10
in_window = (
    (penguins_df["datetime"] >= dataT.loc[case_row, 'caseStart'])
    & (penguins_df["datetime"] <= dataT.loc[case_row, 'caseEnd'])
)
matched = penguins_df[in_window]
len(matched)
46
# Count, for every case, how many penguin tweets fall inside that case's
# [caseStart, caseEnd] window. A comprehension replaces the manual append
# loop, and summing the boolean mask counts matches without materializing
# the filtered DataFrame for each of the 1565 cases.
penguinCounts = [
    ((penguins_df["datetime"] >= row['caseStart'])
     & (penguins_df["datetime"] <= row['caseEnd'])).sum()
    for _, row in dataT.iterrows()
]
penguinCounts_df = pd.DataFrame(penguinCounts)
penguinCounts_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1565 entries, 0 to 1564 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 0 1565 non-null int64 dtypes: int64(1) memory usage: 12.4 KB
# Attach the per-case tweet counts to the case table. Assign the plain list
# rather than the one-column DataFrame: a DataFrame assignment relies on
# index alignment (and may be rejected outright by newer pandas versions),
# while a list of the right length is positional and unambiguous.
dataT['penguinCounts'] = penguinCounts
plt.figure(figsize = (16,8))
dataT['penguinCounts'].plot()
<AxesSubplot:>
# Interactive scatter plots: tweet volume against case duration, then against activity
scatter_fig = px.scatter(dataT, x="penguinCounts", y="relativeTime_s")
scatter_fig.show()

scatter_fig = px.scatter(dataT, x="penguinCounts", y="ActivityName")
scatter_fig.show()

dataT.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1565 entries, 0 to 1564 Data columns (total 43 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TransID 1565 non-null int64 1 ActivityName 1565 non-null object 2 Time 1565 non-null float64 3 eLetter_Type 1565 non-null object 4 eLetter_ID 1565 non-null int64 5 ShapeID 1565 non-null int64 6 ShapeNumber 1565 non-null int64 7 OK_After_Review_14_perc 1565 non-null int64 8 OK_After_Review_17_perc 1565 non-null int64 9 OK_After_Review_25_perc 1565 non-null int64 10 ProcessName 1565 non-null object 11 Analyze_Criteria_AVG 1565 non-null int64 12 Analyze_Criteria_STD 1565 non-null int64 13 Create_Initial_Draft_MAX 1565 non-null int64 14 Create_Initial_Draft_MIN 1565 non-null int64 15 Create_Proof_AVG 1565 non-null int64 16 Create_Proof_STD 1565 non-null int64 17 Delta_Time 1565 non-null int64 18 Gather_Materials 1565 non-null int64 19 Internal_Review_MAX 1565 non-null int64 20 Internal_Review_MIN 1565 non-null int64 21 Modify_Criteria_AVG 1565 non-null int64 22 Modify_Criteria_STD 1565 non-null int64 23 Optimize_Graphics_MAX 1565 non-null int64 24 Optimize_Graphics_MIN 1565 non-null int64 25 Outline_Text 1565 non-null int64 26 Paste_In_Template_AVG 1565 non-null int64 27 Paste_In_Template_STD 1565 non-null int64 28 Preempt 1565 non-null bool 29 Priority 1565 non-null int64 30 Pull_Target_List_MAX 1565 non-null int64 31 Pull_Target_List_MIN 1565 non-null int64 32 Review_13_AVG 1565 non-null int64 33 Review_13_STD 1565 non-null int64 34 Review_16_AVG 1565 non-null int64 35 Review_16_STD 1565 non-null int64 36 Send_On 1565 non-null int64 37 timeStamp 1565 non-null datetime64[ns] 38 caseEnd 1565 non-null datetime64[ns] 39 caseStart 1565 non-null datetime64[ns] 40 relativeTime 1565 non-null timedelta64[ns] 41 relativeTime_s 1565 non-null int64 42 penguinCounts 1565 non-null int64 dtypes: bool(1), datetime64[ns](3), float64(1), int64(34), object(3), timedelta64[ns](1) memory usage: 515.2+ KB
You can then represent this model with a Petri net and visualise it with the pm4py visualizer object from pm4py.visualization.petrinet.
# Keep only the four columns the process-mining log format needs
pm_columns = ['TransID', 'ActivityName', 'eLetter_Type', 'timeStamp']
dataPM = dataT[pm_columns]
dataPM.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1565 entries, 0 to 1564 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TransID 1565 non-null int64 1 ActivityName 1565 non-null object 2 eLetter_Type 1565 non-null object 3 timeStamp 1565 non-null datetime64[ns] dtypes: datetime64[ns](1), int64(1), object(2) memory usage: 49.0+ KB
# Rename the columns to the standard XES attribute names expected by pm4py
xes_names = {
    'timeStamp': 'time:timestamp',
    'TransID': 'case:concept:name',
    'ActivityName': 'concept:name',
    'eLetter_Type': 'org:resource',
}
dataPM = dataPM.rename(columns=xes_names)
dataPM.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1565 entries, 0 to 1564 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 case:concept:name 1565 non-null int64 1 concept:name 1565 non-null object 2 org:resource 1565 non-null object 3 time:timestamp 1565 non-null datetime64[ns] dtypes: datetime64[ns](1), int64(1), object(2) memory usage: 49.0+ KB
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.filtering.log.variants import variants_filter
from pm4py.statistics.traces.log import case_statistics
from pm4py.algo.filtering.log.attributes import attributes_filter
# process mining
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
# viz
from pm4py.visualization.petrinet import visualizer as pn_visualizer
from pm4py.visualization.process_tree import visualizer as pt_visualizer
from pm4py.visualization.dfg import visualizer as dfg_visualization
from pm4py.visualization.heuristics_net import visualizer as hn_visualizer
# misc
from pm4py.objects.conversion.process_tree import converter as pt_converter
from pm4py.objects.conversion.dfg import converter as dfg_mining
# Show every column and every row when displaying DataFrames (no truncation)
pd.options.display.max_columns = None
pd.options.display.max_rows = None
## Convert the renamed DataFrame into a pm4py EventLog so the process-mining
## algorithms below can consume it
log = log_converter.apply(dataPM)
Before applying one of the many process mining algorithms, it will be informative to get some numbers describing our log and process. We will start by asking: how many variants do we have? How many cases are in each variant?
A process variant is a unique path from the very beginning to the very end of the process.
# Extract the distinct end-to-end paths (variants) present in the log
variants = variants_filter.get_variants(log)
n_variants = len(variants)
print(f"We have:{n_variants} variants in our log")
We have:16 variants in our log
## How many cases fall into each of those variants?
variants_count = case_statistics.get_variant_statistics(log)
# Order variants from most to least frequent (in place)
variants_count.sort(key=lambda v: v['count'], reverse=True)
variants_count  ## Show every variant, ordered by case count
[{'variant': 'Request,Start,Schedule Activities,Broadcast Request for Related Articles,Delay,Gather Materials,Outline Text, Targeting, and URLs, Create Initial Draft,Internal Review And Changes,Optimize Graphics,Paste In Template,Review,OK?,Create Proof,Send On,Send Email,End',
'count': 32},
{'variant': 'Outline Text, Targeting, and URLs,Create Initial Target List,Send To Production,Analyze Criteria,OK?,Pull Target List,Send Email',
'count': 27},
{'variant': 'Request,Start,Schedule Activities,Broadcast Request for Related Articles,Delay,Gather Materials,Outline Text, Targeting, and URLs, Create Initial Draft,Internal Review And Changes,Optimize Graphics,Paste In Template,Review,OK?,Create Proof,Send On,Send Email',
'count': 23},
{'variant': 'Outline Text, Targeting, and URLs,Create Initial Target List,Send To Production,Analyze Criteria,OK?,Pull Target List,Send Email,End',
'count': 15},
{'variant': 'Request,Start,Schedule Activities,Broadcast Request for Related Articles,Delay,Gather Materials',
'count': 12},
{'variant': 'Request,Start,Schedule Activities,Broadcast Request for Related Articles',
'count': 12},
{'variant': 'Outline Text, Targeting, and URLs,Create Initial Target List',
'count': 8},
{'variant': 'Outline Text, Targeting, and URLs,Create Initial Target List,Send To Production,Analyze Criteria,OK?,Modify Criteria,Pull Target List,Send Email',
'count': 7},
{'variant': 'Outline Text, Targeting, and URLs,Create Initial Target List,Send To Production,Analyze Criteria,OK?,Modify Criteria,Pull Target List,Send Email,End',
'count': 4},
{'variant': 'Request,Start,Schedule Activities,Broadcast Request for Related Articles,Delay,Gather Materials,Outline Text, Targeting, and URLs, Create Initial Draft,Internal Review And Changes,Optimize Graphics,Paste In Template,Review,OK?,Create Proof,Change,Send On,Send Email',
'count': 2},
{'variant': 'Request,Start,Schedule Activities,Broadcast Request for Related Articles,Delay,Gather Materials,Outline Text, Targeting, and URLs, Create Initial Draft,Internal Review And Changes,Optimize Graphics,Paste In Template,Review,OK?,Create Proof',
'count': 2},
{'variant': 'Request,Start,Schedule Activities,Broadcast Request for Related Articles,Delay,Gather Materials,Outline Text, Targeting, and URLs, Create Initial Draft,Internal Review And Changes,Optimize Graphics,Paste In Template',
'count': 2},
{'variant': 'Outline Text, Targeting, and URLs,Create Initial Target List,Send To Production,Analyze Criteria',
'count': 2},
{'variant': 'Request,Start,Schedule Activities,Broadcast Request for Related Articles,Delay,Gather Materials,Outline Text, Targeting, and URLs, Create Initial Draft,Internal Review And Changes',
'count': 1},
{'variant': 'Request,Start,Schedule Activities,Broadcast Request for Related Articles,Delay,Gather Materials,Outline Text, Targeting, and URLs, Create Initial Draft',
'count': 1},
{'variant': 'Request,Start,Schedule Activities,Broadcast Request for Related Articles,Delay,Gather Materials,Outline Text, Targeting, and URLs',
'count': 1}]
# Tabulate the variant statistics (list of dicts) for easier reading
variant_count_df = pd.DataFrame.from_records(variants_count)
variant_count_df
| variant | count | |
|---|---|---|
| 0 | Request,Start,Schedule Activities,Broadcast Re... | 32 |
| 1 | Outline Text, Targeting, and URLs,Create Initi... | 27 |
| 2 | Request,Start,Schedule Activities,Broadcast Re... | 23 |
| 3 | Outline Text, Targeting, and URLs,Create Initi... | 15 |
| 4 | Request,Start,Schedule Activities,Broadcast Re... | 12 |
| 5 | Request,Start,Schedule Activities,Broadcast Re... | 12 |
| 6 | Outline Text, Targeting, and URLs,Create Initi... | 8 |
| 7 | Outline Text, Targeting, and URLs,Create Initi... | 7 |
| 8 | Outline Text, Targeting, and URLs,Create Initi... | 4 |
| 9 | Request,Start,Schedule Activities,Broadcast Re... | 2 |
| 10 | Request,Start,Schedule Activities,Broadcast Re... | 2 |
| 11 | Request,Start,Schedule Activities,Broadcast Re... | 2 |
| 12 | Outline Text, Targeting, and URLs,Create Initi... | 2 |
| 13 | Request,Start,Schedule Activities,Broadcast Re... | 1 |
| 14 | Request,Start,Schedule Activities,Broadcast Re... | 1 |
| 15 | Request,Start,Schedule Activities,Broadcast Re... | 1 |
# Sum the case counts of the variants at index 0, 2 and 9 — presumably the
# same "Request ... Send Email" path family (TODO confirm against the variant
# listing above). One vectorized .loc lookup replaces three hand-written
# additions; the numpy scalar result keeps the printed output identical.
variant_count_df_sum = variant_count_df.loc[[0, 2, 9], 'count'].sum()
variant_count_df_sum
print(f"Out of {len(log)} cases we have in our log, {variant_count_df_sum} of them (i.e {round(variant_count_df_sum/len(log)*100)}%) are in 1 variant.\nOnly 1 variant out of {len(variants)}.")
Out of 151 cases we have in our log, 57 of them (i.e 38.0%) are in 1 variant. Only 1 variant out of 16.
## Which activities appear in the log, and how often? (all cases, no filter)
activities = attributes_filter.get_attribute_values(log, "concept:name")
# Build a one-column frequency table, most frequent activity first
activities_df = (
    pd.Series(activities, name='freq')
    .to_frame()
    .sort_values(by=['freq'], ascending=False)
)
activities_df
| freq | |
|---|---|
| Outline Text, Targeting, and URLs | 127 |
| OK? | 112 |
| Send Email | 110 |
| Request | 88 |
| Schedule Activities | 88 |
| Broadcast Request for Related Articles | 88 |
| Start | 88 |
| Delay | 76 |
| Gather Materials | 76 |
| Create Initial Draft | 63 |
| Create Initial Target List | 63 |
| Internal Review And Changes | 62 |
| Optimize Graphics | 61 |
| Paste In Template | 61 |
| Review | 59 |
| Create Proof | 59 |
| Send On | 57 |
| Send To Production | 55 |
| Analyze Criteria | 55 |
| Pull Target List | 53 |
| End | 51 |
| Modify Criteria | 11 |
| Change | 2 |
A few activities stand out as having a lot of actions; this could be due to some sort of self-loop, rework, or some other reason.
The starting point for the Alpha algorithm is the set of ordering relations (sorted by timestamp, of course). So we do not consider the frequencies, nor do we consider other attributes!
## Discover a Petri net from the log with the Alpha miner
net, initial_marking, final_marking = alpha_miner.apply(log)
# Render the net and save it to the local images folder
petri_viz = pn_visualizer.apply(net, initial_marking, final_marking)
pn_visualizer.view(petri_viz)
pn_visualizer.save(petri_viz, "_images/alpha_miner_petri_net.png")
# Render the same Petri net again, annotated with event frequencies from the log
viz_params = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}
freq_viz = pn_visualizer.apply(
    net,
    initial_marking,
    final_marking,
    parameters=viz_params,
    variant=pn_visualizer.Variants.FREQUENCY,
    log=log,
)
pn_visualizer.view(freq_viz)
pn_visualizer.save(freq_viz, "_images/alpha_miner_petri_net_freq.png")
# Build a directly-follows graph from the log and render it with edge frequencies
freq_dfg = dfg_discovery.apply(log)
freq_graph = dfg_visualization.apply(freq_dfg, log=log,
                                     variant=dfg_visualization.Variants.FREQUENCY)
dfg_visualization.view(freq_graph)
dfg_visualization.save(freq_graph, "_images/Direct flows graph with frequency and time between the edges.png")
## How much time does it take to travel between activities in our process?
# NOTE: `dfg` is kept as the name here because the workflow-net conversion below reuses it
dfg = dfg_discovery.apply(log, variant=dfg_discovery.Variants.PERFORMANCE)
perf_graph = dfg_visualization.apply(dfg, log=log, variant=dfg_visualization.Variants.PERFORMANCE)
dfg_visualization.view(perf_graph)
dfg_visualization.save(perf_graph, "_images/Direct flows graph performance.png")
# Convert the directly-follows graph into a workflow (Petri) net and render it
net, im, fm = dfg_mining.apply(dfg)
wf_viz = pn_visualizer.apply(net, im, fm)
pn_visualizer.view(wf_viz)
pn_visualizer.save(wf_viz, "_images/Workflow Net.png")